{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Model selection is the task of selecting a statistical model from a set of candidate models, given data. In the simplest cases, a pre-existing set of data is considered. Given candidate models of similar predictive or explanatory power, the simplest model is most likely to be the best choice."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data originates from Google BigQuery. It is also publicly available as a CSV file at this Cloud Storage URL: <https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv>. The loading cell below expects the file to be saved as `stack-overflow-data.csv` in the notebook's working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from numpy import random\n",
    "import gensim\n",
    "import nltk\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.corpus import stopwords\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>post</th>\n",
       "      <th>tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>what is causing this behavior  in our c# datet...</td>\n",
       "      <td>c#</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>have dynamic html load as if it was in an ifra...</td>\n",
       "      <td>asp.net</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>how to convert a float value in to min:sec  i ...</td>\n",
       "      <td>objective-c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.net framework 4 redistributable  just wonderi...</td>\n",
       "      <td>.net</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>trying to calculate and print the mean and its...</td>\n",
       "      <td>python</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>how to give alias name for my website  i have ...</td>\n",
       "      <td>asp.net</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>window.open() returns null in angularjs  it wo...</td>\n",
       "      <td>angularjs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>identifying server timeout quickly in iphone  ...</td>\n",
       "      <td>iphone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>unknown method key  error in rails 2.3.8 unit ...</td>\n",
       "      <td>ruby-on-rails</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>from the include  how to show and hide the con...</td>\n",
       "      <td>angularjs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                post           tags\n",
       "0  what is causing this behavior  in our c# datet...             c#\n",
       "1  have dynamic html load as if it was in an ifra...        asp.net\n",
       "2  how to convert a float value in to min:sec  i ...    objective-c\n",
       "3  .net framework 4 redistributable  just wonderi...           .net\n",
       "4  trying to calculate and print the mean and its...         python\n",
       "5  how to give alias name for my website  i have ...        asp.net\n",
       "6  window.open() returns null in angularjs  it wo...      angularjs\n",
       "7  identifying server timeout quickly in iphone  ...         iphone\n",
       "8  unknown method key  error in rails 2.3.8 unit ...  ruby-on-rails\n",
       "9  from the include  how to show and hide the con...      angularjs"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('stack-overflow-data.csv')\n",
    "df = df[pd.notnull(df['tags'])]\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10276752"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['post'].apply(lambda x: len(x.split(' '))).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have over 10 million words in the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmAAAAExCAYAAADbUR4fAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xm0ZGV57/HvD3BWFKVRZJbggEYBWyRxnoGoqIlKJyrBgaiYOCQmam4CEl1JNOpVr6IoICpKQPSKBkVEBlERmxkFrji3IKA4EFEM8Nw/3l129aF64Jza+/Q5/f2sdVZXvbWrnl2nT1U99Q7Pm6pCkiRJw9lovk9AkiRpQ2MCJkmSNDATMEmSpIGZgEmSJA3MBEySJGlgJmCSJEkDMwGTJEkamAmYJEnSwEzAJEmSBrbJfJ/A2my++ea1/fbbz/dpSJIkrdU555zz06pasrbj1vsEbPvtt2f58uXzfRqSJElrleQH63KcQ5CSJEkDMwGTJEkamAmYJEnSwEzAJEmSBmYCJkmSNLC1JmBJtklyapJLknwzySu79rsnOTnJt7t/N+vak+RdSS5PcmGS3cYea7/u+G8n2a+/pyVJkrT+WpcesBuBv62qBwB7AAcm2Rl4HXBKVe0EnNJdB9gL2Kn7OQA4FFrCBhwEPBzYHTholLRJkiRtSNaagFXVlVV1bnf5OuASYCtgH+Co7rCjgGd0l/cBPlzNWcDdkmwJPAU4uaquraqfAycDe0712UiSJC0At6oQa5LtgV2BrwP3rKoroSVpSbboDtsK+NHY3VZ0batrnxTnAFrvGdtuu+1qz2f71/3XrTn93/v+v/3Jrb/TwXedVSwO/uWs7vaHR/3hrO530X4Xzep+l9z/AbO63wMuvWRW93vPS780q/sd+L7Hz+p+b3vuU2d1v7/9z8/O6n4rXvflWd1v63971Kzud/DBBw96v1O+tOOs7veEx39nVve716nnz+p+P3ncLrO635DvLYO+j4HvZavhe9lkvpdNNtv3snHrPAk/yZ2B44FXVdWv1nTohLZaQ/stG6sOq6qlVbV0yZK1VvOXJElaUNYpAUtyG1rydXRVfbJrvqobWqT79+qufQWwzdjdtwauWEO7JEnSBmVdVkEGOBy4pKrePnbTCcBoJeN+wKfH2l/QrYbcA/hlN1R5EvDkJJt1k++f3LVJkiRtUNZlDtgjgOcDFyUZTcR4A/BvwLFJXgT8EHh2d9uJwN7A5cD1wP4AVXVtkn8BvtEdd0hVXTuVZyFJkrSArDUBq6ozmTx/C+AJE44v4MDVPNYRwBG35gQlSZIWGyvhS5IkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDWytCViSI5JcneTisbb/THJ+9/P9JOd37dsn+c3Ybe8bu89Dk1yU5PIk70qSfp6SJEnS+m2TdTjmQ8D/AT48aqiq544uJ3kb8Mux479TVbtMeJxDgQOAs4ATgT2Bz936U5YkSVrY1toDVlVnANdOuq3rxXoO8PE1PUaSLYFNq+prVVW0ZO4Zt/50JUmSFr65zgF7FHBVVX17rG2HJOclOT3Jo7q2rYAVY8es6NomSnJAkuVJll9zzTVzPEVJkqT1y1wTsGWs2vt1JbBtVe0KvAb4WJJNgUnzvWp1D1pVh1XV0qpaumTJkjmeoiRJ0vplXeaATZRkE+BZwENHbVV1A3BDd/mcJN8B7kvr8dp67O5bA1fMNrYkSdJCNpcesCcCl1bV74cWkyxJsnF3+T7ATsB3q+pK4Loke3Tzxl4AfHoOsSVJkhasdSlD8XHga8D9kqxI8qLupn255eT7RwMXJrkA+ATw0qoaTeB/GfBB4HLgO7gCUpIkbaDW
OgRZVctW0/6XE9qOB45fzfHLgQfdyvOTJEladKyEL0mSNDATMEmSpIGZgEmSJA3MBEySJGlgJmCSJEkDMwGTJEkamAmYJEnSwEzAJEmSBmYCJkmSNDATMEmSpIGZgEmSJA3MBEySJGlgJmCSJEkDMwGTJEkamAmYJEnSwEzAJEmSBmYCJkmSNLC1JmBJjkhydZKLx9oOTvLjJOd3P3uP3fb6JJcnuSzJU8ba9+zaLk/yuuk/FUmSpIVhXXrAPgTsOaH9HVW1S/dzIkCSnYF9gQd293lvko2TbAy8B9gL2BlY1h0rSZK0wdlkbQdU1RlJtl/Hx9sHOKaqbgC+l+RyYPfutsur6rsASY7pjv3WrT5jSZKkBW4uc8BekeTCbohys65tK+BHY8es6NpW1z5RkgOSLE+y/JprrpnDKUqSJK1/ZpuAHQrsCOwCXAm8rWvPhGNrDe0TVdVhVbW0qpYuWbJklqcoSZK0flrrEOQkVXXV6HKSDwCf7a6uALYZO3Rr4Iru8uraJUmSNiiz6gFLsuXY1WcCoxWSJwD7Jrldkh2AnYCzgW8AOyXZIcltaRP1T5j9aUuSJC1ca+0BS/Jx4LHA5klWAAcBj02yC20Y8fvAXwFU1TeTHEubXH8jcGBV3dQ9ziuAk4CNgSOq6ptTfzaSJEkLwLqsglw2ofnwNRz/ZuDNE9pPBE68VWcnSZK0CFkJX5IkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWBrTcCSHJHk6iQXj7W9NcmlSS5M8qkkd+vat0/ymyTndz/vG7vPQ5NclOTyJO9Kkn6ekiRJ0vptXXrAPgTsOaPtZOBBVfVg4P8Brx+77TtVtUv389Kx9kOBA4Cdup+ZjylJkrRBWGsCVlVnANfOaPtCVd3YXT0L2HpNj5FkS2DTqvpaVRXwYeAZsztlSZKkhW0ac8BeCHxu7PoOSc5LcnqSR3VtWwErxo5Z0bVNlOSAJMuTLL/mmmumcIqSJEnrjzklYEn+EbgROLpruhLYtqp2BV4DfCzJpsCk+V61usetqsOqamlVLV2yZMlcTlGSJGm9s8ls75hkP+CpwBO6YUWq6gbghu7yOUm+A9yX1uM1Pky5NXDFbGNLkiQtZLPqAUuyJ/APwNOr6vqx9iVJNu4u34c22f67VXUlcF2SPbrVjy8APj3ns5ckSVqA1toDluTjwGOBzZOsAA6irXq8HXByV03irG7F46OBQ5LcCNwEvLSqRhP4X0ZbUXkH2pyx8XljkiRJG4y1JmBVtWxC8+GrOfZ44PjV3LYceNCtOjtJkqRFyEr4kiRJAzMBkyRJGpgJmCRJ0sBMwCRJkgZmAiZJkjQwEzBJkqSBmYBJkiQNzARMkiRpYCZgkiRJAzMBkyRJGpgJmCRJ0sBMwCRJkgZmAiZJkjQwEzBJkqSBmYBJkiQNzARMkiRpYCZgkiRJA1unBCzJEUmuTnLxWNvdk5yc5Nvdv5t17UnyriSXJ7kwyW5j99mvO/7bSfab/tORJEla/61rD9iHgD1ntL0OOKWqdgJO6a4D7AXs1P0cABwKLWEDDgIeDuwOHDRK2iRJkjYk65SAVdUZwLUzmvcBjuouHwU8Y6z9w9WcBdwtyZbAU4CTq+raqvo5cDK3TOokSZIWvbnMAbtnVV0J0P27Rde+FfCjseNWdG2ra7+FJAckWZ5k+TXXXDOHU5QkSVr/9DEJPxPaag3tt2ysOqyqllbV0iVLlkz15CRJkubbXBKwq7qhRbp/r+7aVwDbjB23NXDFGtolSZI2KHNJwE4ARisZ9wM+Pdb+gm415B7AL7shypOAJyfZrJt8/+SuTZIkaYOyyboclOTjwGOBzZOsoK1m/Dfg2CQvAn4IPLs7/ERgb+By4Hpgf4CqujbJvwDf
6I47pKpmTuyXJEla9NYpAauqZau56QkTji3gwNU8zhHAEet8dpIkSYuQlfAlSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLAZp2AJblfkvPHfn6V5FVJDk7y47H2vcfu8/oklye5LMlTpvMUJEmSFpZNZnvHqroM2AUgycbAj4FPAfsD76iq/xg/PsnOwL7AA4F7A19Mct+qumm25yBJkrQQTWsI8gnAd6rqB2s4Zh/gmKq6oaq+B1wO7D6l+JIkSQvGtBKwfYGPj11/RZILkxyRZLOubSvgR2PHrOjaJEmSNihzTsCS3BZ4OnBc13QosCNtePJK4G2jQyfcvVbzmAckWZ5k+TXXXDPXU5QkSVqvTKMHbC/g3Kq6CqCqrqqqm6rqZuADrBxmXAFsM3a/rYErJj1gVR1WVUuraumSJUumcIqSJEnrj2kkYMsYG35MsuXYbc8ELu4unwDsm+R2SXYAdgLOnkJ8SZKkBWXWqyABktwReBLwV2PNb0myC2148fuj26rqm0mOBb4F3Agc6ApISZK0IZpTAlZV1wP3mNH2/DUc/2bgzXOJKUmStNBZCV+SJGlgJmCSJEkDMwGTJEkamAmYJEnSwEzAJEmSBmYCJkmSNDATMEmSpIGZgEmSJA3MBEySJGlgJmCSJEkDMwGTJEkamAmYJEnSwEzAJEmSBmYCJkmSNDATMEmSpIGZgEmSJA3MBEySJGlgc07Aknw/yUVJzk+yvGu7e5KTk3y7+3ezrj1J3pXk8iQXJtltrvElSZIWmmn1gD2uqnapqqXd9dcBp1TVTsAp3XWAvYCdup8DgEOnFF+SJGnB6GsIch/gqO7yUcAzxto/XM1ZwN2SbNnTOUiSJK2XppGAFfCFJOckOaBru2dVXQnQ/btF174V8KOx+67o2iRJkjYYm0zhMR5RVVck2QI4Ocmlazg2E9rqFge1RO4AgG233XYKpyhJkrT+mHMPWFVd0f17NfApYHfgqtHQYvfv1d3hK4Btxu6+NXDFhMc8rKqWVtXSJUuWzPUUJUmS1itzSsCS3CnJXUaXgScDFwMnAPt1h+0HfLq7fALwgm415B7AL0dDlZIkSRuKuQ5B3hP4VJLRY32sqj6f5BvAsUleBPwQeHZ3/InA3sDlwPXA/nOML0mStODMKQGrqu8CD5nQ/jPgCRPaCzhwLjElSZIWOivhS5IkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDWzWCViSbZKcmuSSJN9M8squ/eAkP05yfvez99h9Xp/k8iSXJXnKNJ6AJEnSQrPJHO57I/C3VXVukrsA5yQ5ubvtHVX1H+MHJ9kZ2Bd4IHBv4ItJ7ltVN83hHCRJkhacWfeAVdWVVXVud/k64BJgqzXcZR/gmKq6oaq+B1wO7D7b+JIkSQvVVOaAJdke2BX4etf0iiQXJjkiyWZd21bAj8butoLVJGxJDkiyPMnya665ZhqnKEmStN6YcwKW5M7A8cCrqupXwKHAjsAuwJXA20aHTrh7TXrMqjqsqpZW1dIlS5bM9RQlSZLWK3NKwJLchpZ8HV1VnwSoqquq6qaquhn4ACuHGVcA24zdfWvgirnElyRJWojmsgoywOHAJVX19rH2LccOeyZwcXf5BGDfJLdLsgOwE3D2bONLkiQtVHNZBfkI4PnARUnO79reACxLsgttePH7wF8BVNU3kxwLfIu2gvJAV0BKkqQN0awTsKo6k8nzuk5cw33eDLx5tjElSZIWAyvhS5IkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgIm
SZI0MBMwSZKkgZmASZIkDcwETJIkaWAmYJIkSQMzAZMkSRqYCZgkSdLATMAkSZIGZgImSZI0MBMwSZKkgZmASZIkDWzwBCzJnkkuS3J5ktcNHV+SJGm+DZqAJdkYeA+wF7AzsCzJzkOegyRJ0nwbugdsd+DyqvpuVf0OOAbYZ+BzkCRJmlepquGCJX8G7FlVL+6uPx94eFW9YsZxBwAHdFfvB1w2i3CbAz+dw+mur7GMZzzjbTjxFvNzM57xFmu87apqydoO2mQWDzwXmdB2iwywqg4DDptToGR5VS2dy2Osj7GMZzzjbTjxFvNzM57xNvR4Qw9BrgC2Gbu+NXDFwOcgSZI0r4ZOwL4B7JRkhyS3BfYFThj4HCRJkubVoEOQVXVjklcAJwEbA0dU1Td7CjenIcz1OJbxjGe8DSfeYn5uxjPeBh1v0En4kiRJshK+JEnS4EzAJEmSBmYCJkmSNLBFkYAleWqSRfFctDglud26tGntkvz7urRp3SXZIsm2o5/5Ph9pQ7BYkpZ9gW8neUuSB8z3yfQhyeYDxzsqyd3Grm+W5IieY94rydOTPC3JvXqK8aw1/fQRs/O1dWzT2j1pQttefQZM8ogkd+ouPy/J25Ns12fMIXSvt28D3wNOB74PfG6g2A8cIs58SPLIJPt3l5ck2WG+z2kakuw4+uKY5LFJ/mb8c2LKse6+pp8+Yg5t6Er4vaiq5yXZFFgGHJmkgCOBj1fVddOMleQiJlTvp1X5r6p68JTjbVRVNwNfAHbr2l5ZVe+cZpwJHlxVvxhdqaqfJ9m1r2BJXgz8M/Al2u/y3UkOqappJ31P6/7dAvjjLh7A44DTgE9OM1iXSG4F3KH7/Y12g9gUuOM0Y82I+wjgYGA72ut89Pd5nx5inQw8e/T3kmQz4JiqesqU47wMeDlwnyQXjt10F+Ar04w1waHAQ5I8BPh74HDgw8Bjph0oyVuANwG/AT4PPAR4VVV9dNqxgH8B9gC+WFW7Jnkc7X10CB+he0/rU5JnV9Vxa2ubYryDgKW0bfSOBG4DfBR4xJTjvJvJn0UAVNXfTDNe53hgaZI/oL0GTgA+BuzdQ6xzaM9vdTvoTP29DIZ9/S2KBAygqn6V5HjgDsCrgGcCr03yrqp69xRDPXWKj7UuTk/ya+BeSfYELgT2A/pOwDZKsllV/RzatxH6/Xt5LbBrVf2si3cP4KvAVBOwqhp9K/0ssHNVXdld3xJ4zzRjdZ4C/CVt14e3j7X/CnhDD/FGDgdeTXsTu6nHOACbT0jWt+ghzsdovTP/CrxurP26qrq2h3jjbqyqSrIP8M6qOjzJfj3FenJV/X2SZ9J2D3k2cCrtQ3za/qeqfpZko+7L3qkDDudO+mDtw+uBmcnWpLZpeSawK3AuQFVdkeQuPcRZ3sNjrs3NXT3PZwL/u6reneS8PgJV1Xz1Gg72+lsUCViSpwP7AzvSvlXtXlVXJ7kjcAkwtQSsqn4wI/am9Ph7rKpHdV285wC7Ay8G7pvkGOD0qjq0p9BvA76a5BO0bxvPAd7cUyxof+jjvZXXAT/qMd72o+SrcxVw32kHqaqjgKOS/GlVHT/tx1+DX1bVIENJwM1Jtq2qHwJ0Q3NTLzBYVb8EfgksS7Ib8MguzleAvhOw65K8Hng+8KgkG9N6Nvowety9ab341ya95Sq/SHJn4Azg6CRXAzf2FazrHRr1atwzyT+PbquqQ6Ycay/a73CrJO8au2lTenyOwO+6ZL2687hTH0G695ah/U+SZbROgNFoQl+vAwC61/pqVdW5Uw452OtvUSRgwJ8C76iqM8Ybq+r6JC/sI2CSvwIOoXVTjj5spt4tmuQLtLlCNwPv7noXzqMNgzx6mrHGVdWHkywHHk97s3xWVX2rr3jAj4GvJ/k07fe4D3B2ktd05/P2Nd15Fk5LchLw8S7evrRvOX35SpLDgXtX1V5Jdgb+qKoO7yneqUneShtSvWHU2MObFcA/AmcmOb27/mjggB7i
AJDkn2hfCEbDxUcmOa6q3tRXTOC5wJ8DL6yqn6RNVH9rT7E+k+RS2nvLy5MsAX7bU6x9ujivBv4CuCvtfa0v3x+7/D/AD1Zz3DRcQeslejrtC+zIdbTn25djk7wfuFuSlwAvBD7QV7Akn2HNQ5FPn2K4/YGXAm+uqu91c9v66Jkd917aUPWFtM+iBwNfp/39FO0zapoGe/0t+Er43TfRk6rqiQPH/TbtA/SnPce5I/BHtD/y5cA9gT+gzd34clXNRzf01HXfjFerqt7YQ8xnAY/qrp5RVZ+adoyxWJ+jzQf5x6p6SJJNgPOq6g97ijcpmayqmvab1Sje5rS5RAG+1ufrIskltOHq33bX7wCcW1W9LsBJck/gYd3Vs6vq6h5jbQb8qqpu6t4DNq2qn/QQ59XAcVW1YtqPvQ6xz62qIeaA3YbW2bBtVV3Wd7wu5pOAJ9NeDydV1ck9xnoncC9WJkLLaInuSQBVdfrke8457mbANlV14VoPnlucY2gJ30Xd9QcBf1dVf9ljzEFefwu+B6z7BV2f5K7dEMVQvgNc33eQqroeOCXJT6rqafD7hQA/onUDL4oErI8Eax1ifpIpT7pfg82r6thuGGu0L2pvc7Oq6nF9PfZq4v0U+GySg6vqMz2H+z5we1Z+K70d7fXYmyTPofV4ncbKRSKvrapPTDHG46vqSxlbjTs29FFJrgXOrKpp/t1sCpzUPfYxwCeq6qopPv6aDDUHbE/gP4DbAjsk2QU4ZMo9Q6voEq7ekq4Zdq2q8dGQzyQ5o6qmPsc0yWm0HsVNgPOBa5KcXlWvmXasMfcfJV8AVXVx9384VZmwCn7G0OPUPysWfALW+S1wUdpqrF+PGntaBTLyetocqa+z6hBPXzH/dOzymd0b/9Te/OdbkvsCfwdsz9jf5bR7bJKcWVWPTHIdq3bbj1YJbjrNeGN+3S0sGM0L2YM2n6kXSe4KHMTKYerTaR86fX9JeTpt9WWfbgC+2b3ei1aW4szRPJ+eXoP/CDxs1OvVDUt8kem+Bh9DW5X7tNXcfg/gfzG5DMesdF983pjkwbRh1tOTrBhoROEJA8SA9ve4Oy15pqrOT7J9X8G6D/J/p620Dv2/tyxJcp+q+m4X/z7Akp5i3bVb8PZi4MiqOiirrkjuwyVJPkjr4SvgebS53dO2utcdXVwTsNX4r+5nSO+nvVleRJuf1beD0spP/KKqXtZ1kb6tqnqZ4zYPjgPeB3yQHlftVdUju3/7WJW0Jq+hLdneMclXaG+Qf9ZjvCOAi2lzpaBNHj8S6LPWGQzTq/Gp7mfktAFibjRjyPFnTLmOYvdhthHwuao6dtIx3TzCPlwN/IT2vPpYwXoLA6xcHbmxqn7Z40KGmd4CPK2q+kgSJnkVbU7rd2mJwg70Nwdzk7QV48+hfSkZwv7Ay4BXdtfPoJWFmarRCvkhLYoErKqO6uaBDDbGT3tR99ntOtOgdbnmwY09ruicKK2m0/gcsN6+yVXVuUkeQ6sNFOCyqvqfvuIBO1bVeK/pG5Oc32O8kYf2HWCeVn99fmzRBrTeohOnHaSqbk7yCmBiAlZVL5pmvLTaas+lfSH4BPCSnhfbjMc+fsbfaF8uTvLnwMZJdgL+hlbipi9XDZh8QRtGfhAt8Xo6rb5hX3Mw30ibW3ZmVX2j6237dk+xAOjmer4DeEdaOaStR/M/pynJ86rqo6OFXxPOY9oLwRZHJfwkT6ONR3++u75LkhN6DntqkgOSbJlhqvNu1PV6AYPU5RrE2O/tM0kOHOr3meSVwNG0b/tb0Jbg/3Vf8Tq704r67UYrpfCCHmP9JskjR1fSCrP+po9ASe6b5JQkF3cJxIOT/K8e4hzb/XtRkgtn/kw73riqei1wGG0F1kOAw6rqH3oKd3KSv0uyzQCvhe1oRSYfWFUHDZV8dXoppDnBXwMPpA1df5xWg+9VPcZbnuQ/kyzLMLts/FNV/YpWkPhJtJGEvr7MPg14TFW9vLv+c3qcSgFt
3lmSTbvXwPm0Vc9TT4aAUbmQu6zmZ+oW/CpIgCTn0JainlZVu3ZtF/W1wqx7/O9NaK7qodJ4F+8FtHlnq9TlqqqP9BFvKN3vcbza8Sp/kD3+Pi+krWL9dXf9TrTVe1PdyWAs3kdoderOZ+UQa/U1Z7Dr3fswraxAaHWy/rKqLugh1um0QrrvH3v9XVxVD5pynC2r6sqsZgugmlGjb6Ea+r2li7kFbWHDKNgPe4oz2mcytGkje3WXe4s5tCRHTmiuvqaLJDmv2i4G/wpcVFUfG7X1FWttbX3E7OadbTOad9bXe/WQFnwPSmfSGH/fmeUDZnaDJrn96g6eqxq+Ltcgqqt23A0hv5yVxTW/TPsm15ew6lyzm+h3/tJSWuX9Qb7xdInWQ9IKBdN9Q+7LHavq7Bmvv6kXuuySr42BwweaJD4vizZqwArg3ejB24F70+aBbUeb4NzXPo1HsfIL13bd9dBPPSdguAU+Y4879FyiH6fVHXsi8O9pezX2Nbo19A4pMPC8s+5z/EW018D4l5KpJ9CLJQEbeoyf7vFn1rCZ1DY1XcK14JOu1TiKNjQwqli9rGt7zmrvMTdH0gq/jiZzP4O2fU9fLqbV6rlybQdOQ/cm/Kd0Hzqj5KimXG2889MkO7Jyheef0dPzrIHLzszXoo20Wkc7s+oHwId7CPUmBtwLssbKo3Q9G70kQTMMssBnJMnWtN1XHkF7TZwJvLL6q7X2HLpSG1X1iy5ZeW1PsYbeIQVaYeAh5519BLiUto3cIbQCxb3M6VssQ5B3pGXGvy98B/xLTxP1Rpsrf5T2HzOyKfC+qrr/tGNuCJJcUFUPWVvblGOOtrMJbRJ+L3uadbFOBXYBzmbVsiW91CJK8nna3IxV9oKsqrf1EOs+tPlRf0ybE/I94C/6GhLs5oLtQauzNFTZmcGkFSV+LC0BO5E2THdmVU191WyS5VW1NMkFtHpSNyc5u6p2n3asCbF7Hboai3NOVfW+OGQs3sm0fUtH00OeR3s9TK18yHxK28VjNBJzSt8jMUnuPuCK2fEhzwur6sFphXxP6uPLwqLoAatWrPQfGWZZ7Pjmyv8x1n4dbY6WZue8JHtU1VkASR5O2+Nv6tKW+l/YzVHqY2ueSQ4eKM7I1lUIS2zuAAAOVElEQVS150CxflBVT+zm0W1UVdet9R5zMx9lZ4b0Z7SJ/udV1f5pFfg/2FOs0V6QX2aAvSBneOdAcT6T5OW00iXjX376+lBfUlXj88A+lKTPSf+DmoeRmK+nreA+klaipe9eo9Hq9F90PdE/oY0kTN1i6QE7lQlzvvrs3k7yvC7m9qxMZKunIZ5FL217mfsBo4m429K6fW+m/V6nOuEyydHA6xfLxN+ZkhxG2zv0orUePPdYP6StQP5P4Et9v0F2id5vq6sI380Lu133RWzBG/VAdYuLHkf7cndxVU19XlY3evBbWm/G82g9+Uf32eOQ5CjakNwvuuu91jSchwVTXwQ+xMqSJcuA/atqqMKzi0ra/Ikn0vbU3J32PvOhqvp/PcV7MXA88Ie0/8c701aavn/asRZFDxhtguXI7WlzX/r+Fvd82nDLufS3Ue6GZKjempEtadXUz2bVYazetieZKclhVTXVgolp21QV7bW9f1pxxhtYOWm8j5VD96MtTz8QODzJZ4FjqurMHmIBnEJ7Q/7v7vodgC/QhkAXg+VJ7kbbwPkc2vM8e5oBRosLgKtY+eV1tIriTWlbE721qt47zbidQWsaTlrU0M2T6ssLgf9Dq11VtLnBi6Vg9uC6L3Qn08qzPI42/efl3bD566rqa9OK1Y2O/KpbZHAGPZdKWRQ9YJOk7U/1mB4ff+rL7DWctKKot1A9bVy7mnN4aFWdM+XHnFiiYaTvUg1db8Y7aXNeNu4pxvlVtcva2haDtC1zNq2eNzyeEPcewFer6n49PPYFwGNnrKQ7vXosGzThHD5bVU8dKp5mr/tbfB7wAtpw4OG0XUV2oW0kP9VVw2n7
aD567UfO3aLoAcuqRQo3olXjvlfPYb+a5A+HGOLR9A2ZaI3rykJUVV037eQLViZYST5SVc+fEfsjtJ7bqesS2ufSJox/g/5Wr0LbV3O3qjq3i/1QeioyO6RuUchqbxs93yFU1c+SPLanh5+PlXSr6DP5Stsn9CXcsuyFvWCz8zXagoanV9WPx9qXJ+mjVNHJSf6ONtQ5Pjoy9WH5RdEDllWLed5IW4V1SB9DIDOGeHYChhji0ZRMqOe0ij7qOnVxl9Imkd6F9rfyC+CFfSRhXbxzq2q3sesb04o07txDrO/RCsweC5xQXXHbviR5GHAMcEXXtCXw3L5+l0Pp5rKuTg1UsmEQQ6+kG1KSr9IWNcxcgXz8vJ3UAta93t9Aqxs3ntD2VTR7sDmDiyIBG9J8D/FoOpIcQuvO/gjtQ+AvgLtU1Vt6inchcGBVfbm7/kjgvT0sLng97c3qDsBoUnqA39G2z5n6St0km1a/hV4nxbwNK/fVvLT63VdTWmeLdTh8viS5jDbP+2LaoixgcXzWLooELGvZZ6uqPjnUuWhhSPL1qnr42tqmGO8rVfWItbVNMd6/9pFszYjx91X1liTvZvIq5L62WXo28Pmqui5tz8ndgDcNOUTXp6xmj9DqpxCrpizJm2jz56a+WfuGaGzByHzEnvpCqXGLYg4YbduAPwa+1F1/HHAarRBlASZgmummJH9BG8oq2lLxPqtkn522XcjHu3jPBU4bzfvpIXm4b5K9aYnKzWs9enZG1aGX9/T4q/NPVXVc14v4FFo9vkOBXpLnefCwscu3B55AW21tArYeG5veEOANSX7HyppS1df0hg3AQUk+SFv9PF7HbYjP9aV9PvhiScCKts/elfD7JcbvqeH35NLC8ee01XrvpP39fKVr68toSOKgGe1/TD/74B0K7A+8O8lxtLo5l04zQFV9prt4fVUdN35b10vVl1Gi/CfAoVX16SQH9xhvUFX11+PXk9yVlVXVtZ6qgber2oDsD9wfuA0rhyCH6li5us8HXyxDkKuUhJhR6VzaYHUf3stou0T8iFZb6qPTnDM1c8L/6tqmGO+zwI9ptcBGKyDPrh63rZpP3Xy3C6vqAfN9Llo33bSYR9IShS9X1f+d51NasJJcNGSJkiEtlh6w05KcxMrhnWXAmlYUaQM39FLxJK+krYK8jpYE7UYrIviFPuJ1Me9BKzvxPOA84Gjah8J+tL0G5/r4ewF7A1sledfYTZvSbyHkITcfHlySz7ByTt1GtD0hj52/M9KtkeS9wB+wshL+S5M8qaoOnMfTWsjOSrLzUCtlk9yX9n4yc9Xl1FchL4oeMIAkzwQe1V09w28cWpOhl4qn21g8yVNoFeP/CTiyx16iT9K67T/SxfnJ2G3Lq2rOcxuSPIQ2tHoI8M9jN10HnDoqtNmXJFvQ5kgBUItkW6kZRYJvpO21uWK+zke3TpJvAg/qKriPRmQuqh62ktoQpG1TtyOtvFTvJZ+6QsHv45afDVMvc7Oge8BGqyNmTH4EeEmSm4E+t9PQwnbHqvqHAeON/jb3piVEFyTJmu4wRx+k9Zw8Alia5EzafKnfTiP5AqiqC4ALknwK+HXN2JtxGjEmSfJ0WjHPe9PmaGwLXAosig+4+SoSrKm5jPY3OSqTsA0w6E4Gi8zQ29TdWFWHDhFo0fSATdLndhpa2IZeKp7kSGArYAfgIcDGwGlV9dCe4h0L/Io27AhtWH6zqpr65PgkZwFPrKr/7q7fGfhCVfWyN2P3DfXxwBerate0/eGW9blcfEirKRb8S9pq07+tqu8Of1ZaV0lOp61kHe3f+TBaNffrYdj9ZnXrdQt6rgY+xaqrLq2Ef2sl2XK0OlIa6T7k7kR7gf0PK7u1+6qEvxFtuO42tN6hzYGtqurdPcW7YOak9EltU4o16N6MoyHULhHbtapuTnJ2Ve3eR7yhJXkjrcr/x2h/l/vStla7DHhZVT12/s5Oa5PV7DM7Yg/n+m3ISvgLeghy
XZh8aZKqukvaHqI7MTaPqEcvBF4JbE3btmcP2rfiXhIw4Lwke1TVWQBJHk4rtdGHofdm/EXXy3YGcHSSq+l30v/Q9pxREPiwJGdV1SFJ3jBvZ6V1YoK1sNWUN/dek0WfgEmTJHkxt0yIvkoretmHV9KGIs6qqscluT/wxp5iQStK+oIko4np2wKXjPYynfIE1lcBxyVZZW/GKT7+TPvQErxX07aQuittIcBicXOS5wCf6K7/2dhti3vIYgGbMCf59zdhIdYFoyv78jLg0V3TacD7+9jubNEPQUqTdInIKCHaZZQQVVUviUOSb1TVw5KcDzy8qm7oeZhu0D1Lh9ybMcmrgeMW68rAJPehFQj+I9oH+Vm0ZPPHwEOr6sx5PD1pUeuq7t8GOKprej5wU1W9eNqx7AHThuq3VfXbJCS5XVVdmqTPxRorktwN+L/AyUl+Tpvn04tpJ1hrkuSOwGuA7arqJUl2SnK/qvpsTyE3BU5Kci1tK6lPVNVVPcUaXDfJ/mmrudnkS+rXw2bMlf1SN9906kzAtKEaOiF6Znfx4CSn0obNPt9XvIEdSauZ80fd9RXAcUAvCVhVvRF4Y5IH04Y6T0+yoqqe2Ee8oQ1dJFjSKm5KsmNVfQd+3yPdyz7BJmDaIM1nQrQIJ+nuWFXPTbIMoKp+03ONs5GrgZ8APwO2GCDeUD5NKxL8RfrdIF7SLb0WODXJd2lTKraj7Uc5dSZg2uAtwoRoaL9Lcge6icdJdmSsfs60JXkZredrCW2i+kuG2qZkIEMXCZbUqapTkuzEqnNae3k/MwGTNFcH0XoPt0lyNK36/l/2GG874FVVdX6PMebTZ5PsPVSRYEmr6hKuC5Mc1meBZ1dBSpqzbteJPWjfGM+qqp8OEHOx7gU5aJFgSZMlObev/XrBHjBJs5Tk/t3q0dEb1Kjo8bZJtgGu7WM1ZpKnAW9n5V6Q2wGXsHj2ghy6SLCkya7u88HtAZM0K6Pu+W4RwyT3AC6oqudPOe5i3wtyYpHgquqrSLCkTpIHVdXFg8QyAZPUlyRfqKonT/kxF/tekIMWCZa0UpIzgdsCHwI+VlW/6CuWQ5CS5iTJ7YGXA4+krYT8MvC+qvrttJOvzmgvyC+zOPeCHLpIsKROt53UTrT9e5cnORs4sqpOnnYse8AkzUmSY4HrgI92TcuAzarq2T3FuyPwW9rk9OfRKuMfXVXX9hFvaEk+Ras79CraUOvPgdtU1d7zemLSBiTJxsAzgHcBv6K937yhqj45tRgmYJLmIskFM7bumNg2hTiTNjseFXy9GbgWeGtVvXeacedTksfQFQmuqt/N9/lIi123w8b+wJ8AJwOHV9W5Se4NfK2q1rjP7q2KZQImaS6SfIg25HhWd/3hwH5V9fKBz+MetMnqDtdJmpUkZwAfoO0x+5sZtz2/qj4ytVgmYJJmo5ssXsBtaFWjf9hd3w74VlU9aB7OacuqunLtR0rSZEluC9yf9n52WV+9zyZgkmYlyXhX/GbAo7rLZwC/6KMGmCT1KcnewPuB79CmOOwA/FVVfW7qsUzAJM1FklcCLwY+SXvDegbwgap697yemCTdSkkuBZ5aVZd313cE/quq7j/1WCZgkuYiyYXAH1XVr7vrd6JNVn3w/J6ZJN06Sc6oqkePXQ9w+njbtFgHTNJcBbhp7PpNrFydKEnrvSTP6i5+M8mJwLG0OWDPBr7RR0wTMElzdSTw9a5+FbQhyMPn8Xwk6dZ62tjlq4DHdJevoc1xnTqHICXNWbch9yNpPV9nVNV583xKkrReMwGTJEkCkhzJykLPv1dVL5x2LIcgJUmSms+OXb498Ezgij4C2QMmSZI0QZKNgC9W1eOn/dgbTfsBJUmSFomdgG37eGCHICVJ0gavq/l1E/DfY80/Af6hj3gmYJIkaYNXVZXk/KrabYh4DkFKkiQ1X03ysCECOQlfkiQJSPIt4H7A94Ff02obVh9bq5mASZIkAUm2m9ReVT+YeiwTMEmS
pGE5B0ySJGlgJmCSJEkDMwGTJEkamAmYJEnSwP4/vvvmWpsWSzoAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x22345a7fb00>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']\n",
    "plt.figure(figsize=(10,4))\n",
    "df.tags.value_counts().plot(kind='bar');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The classes are very well balanced."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_plot(index):\n",
    "    \"\"\"Print the post text and its tag for the row whose index label equals `index`.\n",
    "\n",
    "    Reads the notebook-level DataFrame `df` (columns 'post' and 'tags').\n",
    "    Prints nothing when no row carries that index label.\n",
    "    \"\"\"\n",
    "    matches = df.loc[df.index == index, ['post', 'tags']].values\n",
    "    # Guard on the selection itself: taking row 0 of an empty selection\n",
    "    # would raise IndexError before any emptiness check could run.\n",
    "    if len(matches) > 0:\n",
    "        post, tag = matches[0]\n",
    "        print(post)\n",
    "        print('Tag:', tag)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's have a look at a few post and tag pairs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "when we need interface c# <blockquote>    <strong>possible duplicate:</strong><br>   <a href= https://stackoverflow.com/questions/240152/why-would-i-want-to-use-interfaces >why would i want to use interfaces </a>   <a href= https://stackoverflow.com/questions/9451868/why-i-need-interface >why i need interface </a>    </blockquote>     i want to know where and when to use it     for example    <pre><code>interface idemo {  // function prototype  public void show(); }  // first class using the interface class myclass1 : idemo {  public void show()  {   // function body comes here   response.write( i m in myclass );  }  }  // second class using the interface class myclass2 : idemo {  public void show()   {   // function body comes here   response.write( i m in myclass2 );   response.write( so  what  );  } </code></pre>   these two classes has the same function name with different body. this can be even achieved without interface. then why we need an interface where and when to use it\n",
      "Tag: c#\n"
     ]
    }
   ],
   "source": [
    "print_plot(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "how to chain expressions inside ngclass when using the {...}[] form  how can i add another expression to an <code>ng-class</code> directive that uses this form:   <pre><code>ng-class= {true: loading   false: loading-done }[data.loader===null]  </code></pre>   i d like to add something like this to the list:   <pre><code>{highlight:isspecial} </code></pre>   is it possible without expanding the first expression     thanks.\n",
      "Tag: angularjs\n"
     ]
    }
   ],
   "source": [
    "print_plot(30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The text needs to be cleaned up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separator punctuation: replaced by a space so the neighbouring tokens do not fuse.\n",
    "# Raw string so the regex backslashes are not treated as (invalid) string escapes.\n",
    "REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\\[\\]\\|@,;]')\n",
    "# Anything that is not a digit, a lowercase letter, a space, '#', '+' or '_' is deleted\n",
    "# (so tokens like 'c#' and 'c++' keep their symbols, while '.' is removed).\n",
    "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n",
    "STOPWORDS = set(stopwords.words('english'))\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Normalise one raw post into a space-separated string of lowercase tokens.\n",
    "\n",
    "    Steps, in order: strip HTML markup, lowercase, turn separator punctuation\n",
    "    into spaces, delete every remaining character outside the set allowed by\n",
    "    BAD_SYMBOLS_RE, and drop English stopwords.\n",
    "\n",
    "    text: a string (may contain HTML)\n",
    "\n",
    "    return: the cleaned string\n",
    "    \"\"\"\n",
    "    text = BeautifulSoup(text, \"lxml\").text  # keep only the text content of the HTML\n",
    "    text = text.lower()  # BAD_SYMBOLS_RE only allows lowercase letters, so lowercase first\n",
    "    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separators become spaces\n",
    "    text = BAD_SYMBOLS_RE.sub('', text)  # remaining disallowed characters are removed outright\n",
    "    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # drop stopwords, collapse whitespace\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['post'] = df['post'].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "need interface c# possible duplicate would want use interfaces need interface want know use example interface idemo function prototype public void show first class using interface class myclass1 idemo public void show function body comes responsewrite myclass second class using interface class myclass2 idemo public void show function body comes responsewrite myclass2 responsewrite two classes function name different body even achieved without interface need interface use\n",
      "Tag: c#\n"
     ]
    }
   ],
   "source": [
    "print_plot(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Way better!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3421180"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['post'].apply(lambda x: len(x.split(' '))).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we have over 3 million words to work with."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.post\n",
    "y = df.tags\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next step is feature engineering. We will convert our text documents to a matrix of token counts (CountVectorizer), then transform the count matrix to a normalized tf-idf representation (tf-idf transformer). After that, we train several classifiers. \n",
    "\n",
    "### Naive Bayes classifier for multinomial models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "\n",
    "nb = Pipeline([('vect', CountVectorizer()),\n",
    "               ('tfidf', TfidfTransformer()),\n",
    "               ('clf', MultinomialNB()),\n",
    "              ])\n",
    "nb.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.7395\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         java       0.63      0.65      0.64       613\n",
      "         html       0.94      0.86      0.90       620\n",
      "      asp.net       0.87      0.92      0.90       587\n",
      "           c#       0.70      0.77      0.73       586\n",
      "ruby-on-rails       0.73      0.87      0.79       599\n",
      "       jquery       0.72      0.51      0.60       589\n",
      "        mysql       0.77      0.74      0.75       594\n",
      "          php       0.69      0.89      0.78       610\n",
      "          ios       0.63      0.59      0.61       617\n",
      "   javascript       0.57      0.65      0.61       587\n",
      "       python       0.70      0.50      0.59       611\n",
      "            c       0.79      0.79      0.79       594\n",
      "          css       0.84      0.59      0.69       619\n",
      "      android       0.66      0.84      0.74       574\n",
      "       iphone       0.64      0.83      0.72       584\n",
      "          sql       0.66      0.64      0.65       578\n",
      "  objective-c       0.79      0.77      0.78       591\n",
      "          c++       0.89      0.83      0.86       608\n",
      "    angularjs       0.94      0.89      0.91       638\n",
      "         .net       0.74      0.66      0.70       601\n",
      "\n",
      "  avg / total       0.75      0.74      0.74     12000\n",
      "\n",
      "Wall time: 880 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from sklearn.metrics import classification_report\n",
    "y_pred = nb.predict(X_test)\n",
    "\n",
    "# scikit-learn metrics take (y_true, y_pred) in that order.\n",
    "print('accuracy %s' % accuracy_score(y_test, y_pred))\n",
    "# Pass my_tags as `labels`, not `target_names`: target_names are applied positionally to the\n",
    "# *sorted* class labels, so an unsorted list attaches the wrong tag name to every row.\n",
    "# With `labels`, rows follow the order of my_tags and each row is named by its own label.\n",
    "print(classification_report(y_test, y_pred, labels=my_tags))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Linear support vector machine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,\n",
       "       tol=None, verbose=0, warm_start=False))])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "sgd = Pipeline([('vect', CountVectorizer()),\n",
    "                ('tfidf', TfidfTransformer()),\n",
    "                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),\n",
    "               ])\n",
    "sgd.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.7891666666666667\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         java       0.74      0.68      0.71       613\n",
      "         html       0.85      0.93      0.89       620\n",
      "      asp.net       0.87      0.95      0.91       587\n",
      "           c#       0.81      0.80      0.80       586\n",
      "ruby-on-rails       0.74      0.88      0.80       599\n",
      "       jquery       0.77      0.41      0.53       589\n",
      "        mysql       0.82      0.68      0.74       594\n",
      "          php       0.70      0.95      0.81       610\n",
      "          ios       0.82      0.56      0.66       617\n",
      "   javascript       0.72      0.59      0.65       587\n",
      "       python       0.71      0.65      0.68       611\n",
      "            c       0.81      0.87      0.84       594\n",
      "          css       0.77      0.79      0.78       619\n",
      "      android       0.83      0.86      0.85       574\n",
      "       iphone       0.81      0.80      0.81       584\n",
      "          sql       0.71      0.68      0.69       578\n",
      "  objective-c       0.81      0.90      0.85       591\n",
      "          c++       0.84      0.96      0.89       608\n",
      "    angularjs       0.87      0.95      0.91       638\n",
      "         .net       0.77      0.89      0.83       601\n",
      "\n",
      "  avg / total       0.79      0.79      0.78     12000\n",
      "\n",
      "Wall time: 940 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "y_pred = sgd.predict(X_test)\n",
    "\n",
    "# sklearn metrics take (y_true, y_pred) in that order.\n",
    "print('accuracy %s' % accuracy_score(y_test, y_pred))\n",
    "# target_names are matched positionally to `labels`. Without an explicit `labels`\n",
    "# list sklearn falls back to the sorted labels found in the data, which is not the\n",
    "# order of my_tags, so every row would be printed under the wrong tag.\n",
    "# NOTE(review): assumes y_test holds the same tag strings as my_tags - confirm.\n",
    "print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip...ty='l2', random_state=None,\n",
       "          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "logreg = Pipeline([('vect', CountVectorizer()),\n",
    "                ('tfidf', TfidfTransformer()),\n",
    "                ('clf', LogisticRegression(n_jobs=1, C=1e5)),\n",
    "               ])\n",
    "logreg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.783\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         java       0.70      0.62      0.66       613\n",
      "         html       0.91      0.91      0.91       620\n",
      "      asp.net       0.97      0.94      0.95       587\n",
      "           c#       0.78      0.77      0.78       586\n",
      "ruby-on-rails       0.77      0.81      0.79       599\n",
      "       jquery       0.59      0.58      0.58       589\n",
      "        mysql       0.77      0.76      0.76       594\n",
      "          php       0.82      0.86      0.84       610\n",
      "          ios       0.70      0.72      0.71       617\n",
      "   javascript       0.61      0.59      0.60       587\n",
      "       python       0.64      0.63      0.64       611\n",
      "            c       0.83      0.83      0.83       594\n",
      "          css       0.78      0.78      0.78       619\n",
      "      android       0.85      0.85      0.85       574\n",
      "       iphone       0.80      0.83      0.81       584\n",
      "          sql       0.65      0.65      0.65       578\n",
      "  objective-c       0.82      0.84      0.83       591\n",
      "          c++       0.91      0.91      0.91       608\n",
      "    angularjs       0.96      0.94      0.95       638\n",
      "         .net       0.78      0.83      0.80       601\n",
      "\n",
      "  avg / total       0.78      0.78      0.78     12000\n",
      "\n",
      "Wall time: 883 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "y_pred = logreg.predict(X_test)\n",
    "\n",
    "# sklearn metrics take (y_true, y_pred) in that order.\n",
    "print('accuracy %s' % accuracy_score(y_test, y_pred))\n",
    "# target_names are matched positionally to `labels`. Without an explicit `labels`\n",
    "# list sklearn falls back to the sorted labels found in the data, which is not the\n",
    "# order of my_tags, so every row would be printed under the wrong tag.\n",
    "# NOTE(review): assumes y_test holds the same tag strings as my_tags - confirm.\n",
    "print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word2vec embedding and Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 2min 11s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from gensim.models import Word2Vec\n",
    "\n",
    "wv = gensim.models.KeyedVectors.load_word2vec_format(\"GoogleNews-vectors-negative300.bin.gz\", binary=True)\n",
    "wv.init_sims(replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Memorial_Hospital',\n",
       " 'Seniors',\n",
       " 'memorandum',\n",
       " 'elephant',\n",
       " 'Trump',\n",
       " 'Census',\n",
       " 'pilgrims',\n",
       " 'De',\n",
       " 'Dogs',\n",
       " '###-####_ext',\n",
       " 'chaotic',\n",
       " 'forgive',\n",
       " 'scholar',\n",
       " 'Lottery',\n",
       " 'decreasing',\n",
       " 'Supervisor',\n",
       " 'fundamentally',\n",
       " 'Fitness',\n",
       " 'abundance',\n",
       " 'Hold']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from itertools import islice\n",
    "list(islice(wv.vocab, 13030, 13050))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A common way to turn a post into a single vector is to average the word vectors of its words. Like the other BOW-based approaches, averaging ignores word order."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_averaging(wv, words):\n",
    "    \"\"\"Average the embeddings of `words` and normalise the mean vector.\n",
    "\n",
    "    :param wv: gensim KeyedVectors; read through `vocab`, `vectors_norm`\n",
    "        and `vector_size` (presumably needs `init_sims` to have been called\n",
    "        so the normalised vectors exist - confirm)\n",
    "    :param words: iterable of tokens. Items that are already numpy arrays are\n",
    "        used as vectors directly; tokens absent from `wv.vocab` are skipped.\n",
    "    :return: the mean vector passed through `gensim.matutils.unitvec` and cast\n",
    "        to float32, or an all-zero vector of length `wv.vector_size` when no\n",
    "        token could be looked up\n",
    "    \"\"\"\n",
    "    vectors = []\n",
    "\n",
    "    for word in words:\n",
    "        if isinstance(word, np.ndarray):\n",
    "            vectors.append(word)\n",
    "        elif word in wv.vocab:\n",
    "            # `syn0norm` is a deprecated alias; gensim's warning asks for `vectors_norm`.\n",
    "            vectors.append(wv.vectors_norm[wv.vocab[word].index])\n",
    "\n",
    "    if not vectors:\n",
    "        logging.warning(\"cannot compute similarity with no input %s\", words)\n",
    "        # FIXME: remove these examples in pre-processing\n",
    "        return np.zeros(wv.vector_size,)\n",
    "\n",
    "    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)\n",
    "\n",
    "def word_averaging_list(wv, text_list):\n",
    "    \"\"\"Stack the averaged word vector of every tokenised post into one 2-D array.\"\"\"\n",
    "    averaged_rows = [word_averaging(wv, tokens) for tokens in text_list]\n",
    "    return np.vstack(averaged_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def w2v_tokenize_text(text):\n",
    "    \"\"\"Split `text` into NLTK word tokens, dropping tokens shorter than two characters.\"\"\"\n",
    "    return [\n",
    "        word\n",
    "        for sent in nltk.sent_tokenize(text, language='english')\n",
    "        for word in nltk.word_tokenize(sent, language='english')\n",
    "        if len(word) >= 2\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(df, test_size=0.3, random_state = 42)\n",
    "\n",
    "test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values\n",
    "train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).\n",
      "  \n",
      "WARNING:root:cannot compute similarity with no input []\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 15.8 s\n"
     ]
    }
   ],
   "source": [
    "X_train_word_average = word_averaging_list(wv,train_tokenized)\n",
    "X_test_word_average = word_averaging_list(wv,test_tokenized)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1min 7s\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "logreg = LogisticRegression(n_jobs=1, C=1e5)\n",
    "logreg = logreg.fit(X_train_word_average, train['tags'])\n",
    "y_pred = logreg.predict(X_test_word_average)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.6379166666666667\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         java       0.63      0.59      0.61       613\n",
      "         html       0.73      0.76      0.75       620\n",
      "      asp.net       0.65      0.67      0.66       587\n",
      "           c#       0.53      0.52      0.52       586\n",
      "ruby-on-rails       0.70      0.77      0.73       599\n",
      "       jquery       0.44      0.39      0.41       589\n",
      "        mysql       0.65      0.61      0.63       594\n",
      "          php       0.73      0.80      0.76       610\n",
      "          ios       0.60      0.61      0.61       617\n",
      "   javascript       0.56      0.52      0.54       587\n",
      "       python       0.55      0.50      0.52       611\n",
      "            c       0.61      0.61      0.61       594\n",
      "          css       0.65      0.65      0.65       619\n",
      "      android       0.60      0.57      0.59       574\n",
      "       iphone       0.70      0.71      0.71       584\n",
      "          sql       0.42      0.42      0.42       578\n",
      "  objective-c       0.68      0.71      0.70       591\n",
      "          c++       0.76      0.78      0.77       608\n",
      "    angularjs       0.82      0.83      0.82       638\n",
      "         .net       0.66      0.71      0.68       601\n",
      "\n",
      "  avg / total       0.63      0.64      0.64     12000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# sklearn metrics take (y_true, y_pred) in that order.\n",
    "print('accuracy %s' % accuracy_score(test.tags, y_pred))\n",
    "# target_names are matched positionally to `labels`. Without an explicit `labels`\n",
    "# list sklearn falls back to the sorted labels found in the data, which is not the\n",
    "# order of my_tags, so every row would be printed under the wrong tag.\n",
    "# NOTE(review): assumes test.tags holds the same tag strings as my_tags - confirm.\n",
    "print(classification_report(test.tags, y_pred, labels=my_tags, target_names=my_tags))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Doc2vec and Logistic Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Doc2vec: taking the linear combination of every term in the document creates a biased random-walk process in the word2vec space."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "tqdm.pandas(desc=\"progress-bar\")\n",
    "from gensim.models import Doc2Vec\n",
    "from sklearn import utils\n",
    "import gensim\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_sentences(corpus, label_type):\n",
    "    \"\"\"\n",
    "    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.\n",
    "    We do this by wrapping each post in a TaggedDocument whose single tag is \"<label_type>_<i>\"\n",
    "    (e.g. \"Train_0\" or \"Test_0\"), where \"i\" is the position of the post in `corpus`.\n",
    "\n",
    "    :param corpus: iterable of post strings; each post is split on whitespace into words\n",
    "    :param label_type: prefix used to build the tag of every document\n",
    "    :return: list of TaggedDocument, one per post, in corpus order\n",
    "    \"\"\"\n",
    "    # Use the TaggedDocument name imported from gensim.models.doc2vec; the bare\n",
    "    # module name `doc2vec` is not bound by the doc2vec import cell.\n",
    "    return [\n",
    "        TaggedDocument(post.split(), [label_type + '_' + str(i)])\n",
    "        for i, post in enumerate(corpus)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df.post, df.tags, random_state=0, test_size=0.3)\n",
    "X_train = label_sentences(X_train, 'Train')\n",
    "X_test = label_sentences(X_test, 'Test')\n",
    "all_data = X_train + X_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[TaggedDocument(words=['fulltext', 'search', 'php', 'pdo', 'returning', 'result', 'searched', 'lot', 'matter', 'find', 'wrong', 'setup', 'trying', 'fulltext', 'search', 'using', 'pdo', 'php', 'get', 'results', 'error', 'messages', 'table', 'contains', 'customer', 'details', 'id', 'int', '11', 'auto_increment', 'name', 'varchar', '150', 'lastname', 'varchar', '150', 'company', 'varchar', '250', 'adress', 'varchar', '150', 'postcode', 'int', '5', 'city', 'varchar', '150', 'email', 'varchar', '250', 'phone', 'varchar', '20', 'orgnr', 'varchar', '15', 'timestamp', 'timestamp', 'current_timestamp', 'run', 'sqlquery', 'alter', 'table', 'system_customer', 'add', 'fulltext', 'name', 'lastname', 'except', 'columns', 'id', 'postcode', 'timestamp', 'signs', 'trouble', 'far', 'idea', 'problem', 'lies', 'db', 'configuration', 'php', 'code', 'goes', 'php', 'sth', 'dbhprepare', 'select', 'name', 'lastname', 'company', 'adress', 'city', 'phone', 'email', 'orgnr', 'db_pre', 'customer', 'match', 'name', 'lastname', 'company', 'adress', 'city', 'phone', 'email', 'orgnr', 'search', 'boolean', 'mode', 'bind', 'placeholders', 'sthbindparam', 'search', 'data', 'sthexecute', 'rows', 'sthfetchall', 'testing', 'print_r', 'dbherrorinfo', 'empty', 'rows', 'echo', 'else', 'echo', 'foreach', 'rows', 'row', 'echo', 'tr', 'datahref', 'new_orderphp', 'cid', 'row', 'id', 'echo', 'td', 'row', 'name', 'td', 'echo', 'td', 'row', 'lastname', 'td', 'echo', 'td', 'row', 'company', 'td', 'echo', 'td', 'row', 'phone', 'td', 'echo', 'td', 'row', 'email', 'td', 'echo', 'td', 'date', 'ymd', 'strtotime', 'row', 'timestamp', 'td', 'echo', 'tr', 'echo', 'tried', 'change', 'parameter', 'searchquery', 'string', 'like', 'testcompany', 'somename', 'boolean', 'mode', 'also', 'read', 'word', 'found', '50', 'rows', 'counts', 'common', 'word', 'pretty', 'sure', 'case', 'uses', 'specific', 'words', 'table', 'uses', 'myisam', 'engine', 'get', 'results', 'error', 'messages', 'please', 'help', 'point', 'wrong', 
'thank'], tags=['Train_0']),\n",
       " TaggedDocument(words=['select', 'everything', '1', 'table', 'x', 'rows', 'another', 'im', 'making', 'join', 'query', 'like', 'select', 'clothes', 'c', 'join', 'style', 'cstyleid', 'ssylelid', 'clothesid', '19', 'dont', 'want', 'select', 'everything', 'style', 'want', 'select', 'everything', 'clothes', '20', 'rows', 'select', '1', 'row', '10', 'style', 'easyest', 'way', 'without', 'select', 'every', 'row', 'clothes', '20', 'things', 'select', 'like', 'select', 'cid', 'cdescription', 'cname', 'csize', 'cbrand', 'sname', 'clothes', 'c', 'join', 'style', 'cstyleid', 'stsylelid', 'clothesid', '19', 'would', 'fastest', 'way', 'possibillity'], tags=['Train_1'])]"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 40000/40000 [00:00<00:00, 2559297.07it/s]\n"
     ]
    }
   ],
   "source": [
    "model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)\n",
    "model_dbow.build_vocab([x for x in tqdm(all_data)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 40000/40000 [00:00<00:00, 2560195.33it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2346627.88it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1163541.14it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1393769.03it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2526879.43it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1688749.13it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1769393.90it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1734509.44it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1266463.05it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 5424078.11it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1078213.39it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2491641.07it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1662262.56it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 917093.46it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1036219.09it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2279574.99it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1734527.37it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1375643.95it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1036219.09it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1022926.13it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1092985.36it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2417884.36it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1477596.00it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2156315.92it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1266405.69it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 831221.87it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 6649709.08it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 2559297.07it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 876772.44it/s]\n",
      "100%|██████████| 40000/40000 [00:00<00:00, 1806079.68it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 3min 36s\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(30):\n",
    "    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)\n",
    "    model_dbow.alpha -= 0.002\n",
    "    model_dbow.min_alpha = model_dbow.alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_vectors(model, corpus_size, vectors_size, vectors_type):\n",
    "    \"\"\"\n",
    "    Collect the document vectors of one data split into a matrix.\n",
    "    :param model: Trained Doc2Vec model, indexed through ``model.docvecs``\n",
    "    :param corpus_size: Number of documents in the split\n",
    "    :param vectors_size: Size of the embedding vectors\n",
    "    :param vectors_type: Tag prefix of the split; row i is read from the tag \"<vectors_type>_<i>\"\n",
    "    :return: numpy array of shape (corpus_size, vectors_size)\n",
    "    \"\"\"\n",
    "    matrix = np.zeros((corpus_size, vectors_size))\n",
    "    for row in range(corpus_size):\n",
    "        tag = '_'.join((vectors_type, str(row)))\n",
    "        matrix[row] = model.docvecs[tag]\n",
    "    return matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')\n",
    "test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
       "          fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
       "          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "logreg = LogisticRegression(n_jobs=1, C=1e5)\n",
    "logreg.fit(train_vectors_dbow, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "logreg = logreg.fit(train_vectors_dbow, y_train)\n",
    "y_pred = logreg.predict(test_vectors_dbow)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy 0.8045\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         java       0.73      0.68      0.70       589\n",
      "         html       0.89      0.91      0.90       661\n",
      "      asp.net       0.93      0.94      0.94       606\n",
      "           c#       0.80      0.80      0.80       613\n",
      "ruby-on-rails       0.83      0.90      0.86       601\n",
      "       jquery       0.72      0.71      0.72       585\n",
      "        mysql       0.87      0.81      0.84       621\n",
      "          php       0.81      0.84      0.82       587\n",
      "          ios       0.68      0.67      0.67       560\n",
      "   javascript       0.69      0.63      0.66       611\n",
      "       python       0.63      0.65      0.64       593\n",
      "            c       0.81      0.83      0.82       581\n",
      "          css       0.81      0.77      0.79       608\n",
      "      android       0.84      0.85      0.84       593\n",
      "       iphone       0.84      0.82      0.83       592\n",
      "          sql       0.68      0.65      0.66       597\n",
      "  objective-c       0.84      0.86      0.85       604\n",
      "          c++       0.90      0.95      0.92       610\n",
      "    angularjs       0.93      0.96      0.95       595\n",
      "         .net       0.81      0.84      0.82       593\n",
      "\n",
      "  avg / total       0.80      0.80      0.80     12000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# sklearn metrics take (y_true, y_pred) in that order.\n",
    "print('accuracy %s' % accuracy_score(y_test, y_pred))\n",
    "# target_names are matched positionally to `labels`. Without an explicit `labels`\n",
    "# list sklearn falls back to the sorted labels found in the data, which is not the\n",
    "# order of my_tags, so every row would be printed under the wrong tag.\n",
    "# NOTE(review): assumes y_test holds the same tag strings as my_tags - confirm.\n",
    "print(classification_report(y_test, y_pred, labels=my_tags, target_names=my_tags))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### BOW with keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import os\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "\n",
    "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "from tensorflow import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation, Dropout\n",
    "from keras.preprocessing import text, sequence\n",
    "from keras import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 28000\n",
      "Test size: 12000\n"
     ]
    }
   ],
   "source": [
    "train_size = int(len(df) * .7)\n",
    "print (\"Train size: %d\" % train_size)\n",
    "print (\"Test size: %d\" % (len(df) - train_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_posts = df['post'][:train_size]\n",
    "train_tags = df['tags'][:train_size]\n",
    "\n",
    "test_posts = df['post'][train_size:]\n",
    "test_tags = df['tags'][train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_words = 1000\n",
    "tokenize = text.Tokenizer(num_words=max_words, char_level=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenize.fit_on_texts(train_posts) # only fit on train\n",
    "x_train = tokenize.texts_to_matrix(train_posts)\n",
    "x_test = tokenize.texts_to_matrix(test_posts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = LabelEncoder()\n",
    "encoder.fit(train_tags)\n",
    "y_train = encoder.transform(train_tags)\n",
    "y_test = encoder.transform(test_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_classes = np.max(y_train) + 1\n",
    "y_train = utils.to_categorical(y_train, num_classes)\n",
    "y_test = utils.to_categorical(y_test, num_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_train shape: (28000, 1000)\n",
      "x_test shape: (12000, 1000)\n",
      "y_train shape: (28000, 20)\n",
      "y_test shape: (12000, 20)\n"
     ]
    }
   ],
   "source": [
    "print('x_train shape:', x_train.shape)\n",
    "print('x_test shape:', x_test.shape)\n",
    "print('y_train shape:', y_train.shape)\n",
    "print('y_test shape:', y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 32\n",
    "epochs = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model\n",
    "model = Sequential()\n",
    "model.add(Dense(512, input_shape=(max_words,)))\n",
    "model.add(Activation('relu'))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(Dense(num_classes))\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25200 samples, validate on 2800 samples\n",
      "Epoch 1/2\n",
      "25200/25200 [==============================] - 11s 442us/step - loss: 1.0261 - acc: 0.7180 - val_loss: 0.6658 - val_acc: 0.7975\n",
      "Epoch 2/2\n",
      "25200/25200 [==============================] - 11s 434us/step - loss: 0.5675 - acc: 0.8190 - val_loss: 0.6625 - val_acc: 0.7868\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(x_train, y_train,\n",
    "                    batch_size=batch_size,\n",
    "                    epochs=epochs,\n",
    "                    verbose=1,\n",
    "                    validation_split=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12000/12000 [==============================] - 1s 76us/step\n",
      "Test accuracy: 0.7955833333333333\n"
     ]
    }
   ],
   "source": [
    "score = model.evaluate(x_test, y_test,\n",
    "                       batch_size=batch_size, verbose=1)\n",
    "print('Test accuracy:', score[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
